Working Script

Author

Bhairavi

1. Load necessary libraries

pacman::p_load(readxl, tidyverse)

2. Load the dataset

vaByMarketRaw <- read_excel("outputFile -International Visitor Arrivals.xlsx", sheet = "T1", skip = 10)
vaByDemoRaw <- read_excel("outputFile -International Visitor Arrivals.xlsx", sheet = "T2", skip = 10)
vaByStayRaw <- read_excel("outputFile -International Visitor Arrivals.xlsx", sheet = "T3", skip = 9)

3. Tourism Markets

3.1 Cleansing the dataset

This file

  • contains the international visitor arrivals by inbound tourism markets (monthly)
  • excludes arrivals of Malaysians by land
  • feb 1991 has a sharp decline due to Gulf crisis
  • data for Germany prior to 1991 refers to West Germany only
  • all numbers are counts

The following was done to cleanup the vaByMarketRaw dataframe:

  • remove the bottom few rows as they were unnecessary for our visualizations
  • create a new column to assign the value of Region to the respective Countries
  • rename fields and rearrange the columns
  • filter out rows that are not needed anymore
  • pivot date (month-year) and the number of visitors to reduce the number of columns
vaByMarket <- slice(vaByMarketRaw, 2:(62))

colnames(vaByMarket)[1] <- "Data"

vaByMarket$Region <-
  ifelse(vaByMarket$Data %in% c("Brunei Darussalam", "Indonesia", "Malaysia", "Myanmar", "Philippines", "Thailand", "Vietnam", "Other Markets In Southeast Asia"), "Southeast Asia",
  ifelse(vaByMarket$Data %in% c("China", "Hong Kong SAR", "Taiwan", "Other Markets In Greater China"), "Greater China",
  ifelse(vaByMarket$Data %in% c("Japan", "South Korea", "Other Markets In North Asia"), "North Asia",
  ifelse(vaByMarket$Data %in% c("Bangladesh", "India", "Pakistan", "Sri Lanka", "Other Markets In South Asia"), "South Asia",
  ifelse(vaByMarket$Data %in% c("Iran", "Israel", "Kuwait", "Saudi Arabia", "United Arab Emirates", "Other Markets In West Asia"), "West Asia",
  ifelse(vaByMarket$Data %in% c("Canada", "USA", "Other Markets In Americas"), "Americas",
  ifelse(vaByMarket$Data %in% c("Belgium & Luxembourg", "Denmark", "Finland", "France", "Germany", "Italy", "Netherlands", "Norway", "Rep Of Ireland", "Russian Federation", "Spain", "Sweden", "Switzerland", "United Kingdom", "Other Markets In Europe"), "Europe",
  ifelse(vaByMarket$Data %in% c("Australia", "New Zealand", "Other Markets In Oceania"), "Oceania",
  ifelse(vaByMarket$Data %in% c("Egypt", "Mauritius", "South Africa (Rep Of)", "Other Markets In Africa"), "Africa",
  ifelse(vaByMarket$Data %in% c("Others"), "Others", "NA"
  ))))))))))

vaByMarket <- vaByMarket %>%
  select(542, 1, 2:541)

colnames(vaByMarket)[2] <- "Country"

# sapply(vaByMarket, class)

vaByMarket <- vaByMarket %>%
  mutate_at(vars(-one_of("Region", "Country")), as.numeric) %>%
  filter(vaByMarket$Region != "NA") %>%
  pivot_longer(cols = ! c("Region", "Country"), names_to = "Period", values_to = "Visitors") %>%
  mutate(Period = as.Date(paste(Period, "01"), "%Y %B %d")) %>%
  mutate(Year = year(Period))

3.2. Preparations for visualizations

to find the total number of visitors (regardless of date range):

totalVisitors <- sum(vaByMarket$Visitors, na.rm = TRUE)
totalVisitors
[1] 325005202

to find the min and max year:

minYear <- min(year(vaByMarket$Period))
minYear
[1] 1978
maxYear <- max(year(vaByMarket$Period))
maxYear
[1] 2022

to find the country where most visitors are from:

visitorsByCountry <- vaByMarket %>%
  group_by(Country) %>%
  summarize(Visitors = sum(Visitors, na.rm = TRUE))
  
visitorsByCountry <- visitorsByCountry[order(visitorsByCountry$Visitors, decreasing = TRUE), ]
visitorsByCountry
# A tibble: 52 × 2
   Country        Visitors
   <chr>             <dbl>
 1 Indonesia      50918113
 2 China          34648115
 3 Japan          29394650
 4 Australia      24989059
 5 Malaysia       21009122
 6 India          20987273
 7 United Kingdom 15116531
 8 USA            14620454
 9 South Korea    11942521
10 Hong Kong SAR  11765677
# … with 42 more rows
mostFrom <- head(visitorsByCountry$Country, 1)
mostFrom
[1] "Indonesia"

to find the number of countries that visited us:

temp <- vaByMarket[vaByMarket$Year >= 1984 & vaByMarket$Year <= 1990, ]

numCountries <- nrow(temp %>%
    filter(!is.na(Visitors)) %>%
    count(Country))
numCountries
[1] 33

to list out regions

listRegions <- unique(vaByMarket$Region)
listRegions
 [1] "Southeast Asia" "Greater China"  "North Asia"     "South Asia"    
 [5] "West Asia"      "Americas"       "Europe"         "Oceania"       
 [9] "Africa"         "Others"        

to list out countries

listCountry <- unique(vaByMarket$Country)
listCountry
 [1] "Brunei Darussalam"               "Indonesia"                      
 [3] "Malaysia"                        "Myanmar"                        
 [5] "Philippines"                     "Thailand"                       
 [7] "Vietnam"                         "Other Markets In Southeast Asia"
 [9] "China"                           "Hong Kong SAR"                  
[11] "Taiwan"                          "Other Markets In Greater China" 
[13] "Japan"                           "South Korea"                    
[15] "Other Markets In North Asia"     "Bangladesh"                     
[17] "India"                           "Pakistan"                       
[19] "Sri Lanka"                       "Other Markets In South Asia"    
[21] "Iran"                            "Israel"                         
[23] "Kuwait"                          "Saudi Arabia"                   
[25] "United Arab Emirates"            "Other Markets In West Asia"     
[27] "Canada"                          "USA"                            
[29] "Other Markets In Americas"       "Belgium & Luxembourg"           
[31] "Denmark"                         "Finland"                        
[33] "France"                          "Germany"                        
[35] "Italy"                           "Netherlands"                    
[37] "Norway"                          "Rep Of Ireland"                 
[39] "Russian Federation"              "Spain"                          
[41] "Sweden"                          "Switzerland"                    
[43] "United Kingdom"                  "Other Markets In Europe"        
[45] "Australia"                       "New Zealand"                    
[47] "Other Markets In Oceania"        "Egypt"                          
[49] "Mauritius"                       "South Africa (Rep Of)"          
[51] "Other Markets In Africa"         "Others"                         

3.3. Visualizations

plotting visitors across time chart:

install the timetk package (recommended by Prof Kam)

pacman::p_load(timetk, lubridate, ggplot2, plotly, ggHoriPlot, sunburstR, d3r)

Country/Region Sunburst Plot

vaByMarket
# A tibble: 28,080 × 5
   Region         Country           Period     Visitors  Year
   <chr>          <chr>             <date>        <dbl> <dbl>
 1 Southeast Asia Brunei Darussalam 2022-12-01     7324  2022
 2 Southeast Asia Brunei Darussalam 2022-11-01     4273  2022
 3 Southeast Asia Brunei Darussalam 2022-10-01     3733  2022
 4 Southeast Asia Brunei Darussalam 2022-09-01     3809  2022
 5 Southeast Asia Brunei Darussalam 2022-08-01     3426  2022
 6 Southeast Asia Brunei Darussalam 2022-07-01     3239  2022
 7 Southeast Asia Brunei Darussalam 2022-06-01     2202  2022
 8 Southeast Asia Brunei Darussalam 2022-05-01     2271  2022
 9 Southeast Asia Brunei Darussalam 2022-04-01      807  2022
10 Southeast Asia Brunei Darussalam 2022-03-01      280  2022
# … with 28,070 more rows
country_hier <- vaByMarket %>%
  group_by(Region, Country) %>%
  summarize(Visitors = sum(Visitors, na.rm = TRUE))

country_sunburst <- d3_nest(country_hier, value_cols = "Visitors")

sunburst(data = country_sunburst,
         valueField = "Visitors",
         height = 300,
         width = "100%",
         legend = FALSE) 
Legend

Time Series Plot for Overall Trend (timetk)

timeSeriesOverall <- vaByMarket %>%
  group_by(Period) %>%
  summarise(Visitors = sum(Visitors, na.rm = TRUE))

timeSeriesOverall %>%
  plot_time_series(Period, Visitors, .interactive = TRUE)

Boxplot Time Series

timeSeriesOverall %>%
  plot_time_series(Period, Visitors, .interactive = TRUE)

Time Series by Country

minYear <- 2018
maxYear <- 2022

vaByCountry <- vaByMarket %>%
  filter(Country %in% "Russian Federation")
vaByCountry <- vaByCountry[vaByCountry$Year >= minYear & vaByCountry$Year <= maxYear, ]
vaByCountry[is.na(vaByCountry)] = 0

timeSeriesCountry <- vaByCountry %>%
  group_by(Period) %>%
  summarise(Visitors = sum(Visitors, na.rm = TRUE))

timeSeriesCountry %>%
  plot_time_series(Period, Visitors, .interactive = TRUE)

Cycle Plot by Country

countryAvg <- vaByCountry %>%
  mutate(Month = as.character(Period, format = '%b')) %>%
  group_by(Month) %>%
  summarise(avgValue = mean(Visitors)) %>%
  mutate(avgValue = round(avgValue/1000, digits = 0))

cyclePlot <- vaByCountry %>%
  group_by(Period) %>%
  summarise(Visitors = sum(Visitors, na.rm = TRUE)) %>%
  mutate(Month = as.character(Period, format = '%b')) %>%
  mutate(Visitors = round(Visitors/1000, digits = 0))

figCyclePlot <- ggplot() +
  geom_line(data = cyclePlot, aes(x = Period, y = Visitors, group = Month)) +
  geom_hline(data = countryAvg, aes(yintercept = avgValue), colour = "red", size = 0.3) +
  labs(x = "Date", y = "No. of Visitors", title = paste("Visitor Arrivals by Country, ", minYear, " to ", maxYear)) +
  facet_grid(~ factor(Month, levels = c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"))) +
  theme_bw() +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())

ggplotly(figCyclePlot)

Month/Year Heatmap by Country

vaByMarketCal <- vaByMarket %>%
  mutate(Month = month(Period)) %>%
  filter(Country %in% "Russian Federation")
vaByMarketCal <- vaByMarketCal[vaByMarketCal$Year >= minYear & vaByMarketCal$Year <= maxYear, ]
vaByMarketCal[is.na(vaByMarketCal)] = 0

scaleFUN <- function(x) sprintf("%.0f", x)

figHeatmapPlot <- ggplot(data = vaByMarketCal) +
  geom_tile(aes(x = Month, y = Year, fill = Visitors)) +
  labs(title = paste("Visitor Arrivals by Country, ", minYear, " to ", maxYear)) +
  theme_bw() +
  theme(legend.position = "none") +
  scale_x_continuous(breaks = seq_along(month.name), labels = month.abb) +
  scale_y_continuous(labels = scaleFUN)

ggplotly(figHeatmapPlot)

Compare two regions using the Horizon Plot for Regional Trend?

vaByMarket %>%
  group_by(Period, Region) %>%
  summarise(Visitors = sum(Visitors)) %>%
  filter(Region == "South Asia")  %>%
  ggplot() +
  geom_horizon(aes(x = Period, y =  Visitors), origin = "midpoint", horizonscale = 6) +
  facet_grid(Region~.) +
  scale_fill_hcl(palette = "RdBu") +
  scale_x_date(expand = c(0,0), date_breaks = "3 year", date_labels = " %b %y") +
  theme_minimal() +
  theme(panel.spacing.y = unit(0, "lines"), 
        strip.text.y = element_text(size = 10, angle = 0, hjust = 0),
        legend.position = 'none',
        axis.text.y = element_blank(),
        axis.text.x = element_text(size = 7),
        axis.title.y = element_blank(),
        axis.title.x = element_blank(),
        axis.ticks.y = element_blank(),
        panel.border = element_blank())

Time Series Plot for Regional Trend (ggplotly)

timeseriesRegion <- vaByMarket %>%
  group_by(Region, Period) %>%
  summarise(Visitors = sum(Visitors, na.rm = TRUE)) %>%
  mutate(Visitors = round(Visitors/1000, digits = 0))

figTimeseriesRegion <- ggplot(data = timeseriesRegion, aes(x = Period, y = Visitors)) +
  geom_line(aes(colour = Region)) +
  labs(x = "Date", y = "No. of Visitors (K)", title = "Overall Trend of Visitor Arrivals by Region, 1978 to 2022") +
  theme_bw()

ggplotly(figTimeseriesRegion)

Time Series Plot for Country Trend (ggplotly)

timeseriesCountry <- vaByMarket %>%
  group_by(Country, Period) %>%
  summarise(Visitors = sum(Visitors, na.rm = TRUE)) %>%
  mutate(Visitors = round(Visitors/1000, digits = 0))

figTimeseriesCountry <- ggplot(data = timeseriesCountry, aes(x = Period, y = Visitors)) +
  geom_line(aes(colour = Country)) +
  labs(x = "Date", y = "No. of Visitors (K)", title = "Overall Trend of Visitor Arrivals, 1978 to 2022") +
  theme_bw()

ggplotly(figTimeseriesCountry)

3.4. Forecasting

reference: https://www.r-bloggers.com/2020/06/introducing-modeltime-tidy-time-series-forecasting-using-tidymodels/

loading libraries

pacman::p_load(modeltime, kernlab, reactable, tidymodels)

only keep the date and value columns for forecasting

vaByMarketTable <- vaByMarket %>%
  group_by(Period) %>%
  summarise(visitors = round(sum(Visitors, na.rm = TRUE))/1000) %>% # converted to thousands
  set_names(c("date", "value"))
vaByMarketTable
# A tibble: 540 × 2
   date       value
   <date>     <dbl>
 1 1978-01-01  99.0
 2 1978-02-01  87.0
 3 1978-03-01  95.5
 4 1978-04-01  84.2
 5 1978-05-01  89.7
 6 1978-06-01  78.5
 7 1978-07-01  96.4
 8 1978-08-01 113. 
 9 1978-09-01  87.8
10 1978-10-01  99.6
# … with 530 more rows

let’s create a train/test set”

splits <- vaByMarketTable %>%
  time_series_split(
    assess = "3 years",
    cumulative = TRUE
  )
splits
<Analysis/Assess/Total>
<504/36/540>

let’s plot the train/test split to visualize the split:

splits %>%
  tk_time_series_cv_plan() %>%
  plot_time_series_cv_plan(date, value, .interactive = FALSE)

we will modelling using modeltime and parsnip libraries

Basic Auto ARIMA fitting

modelFitArima <- arima_reg(
  non_seasonal_ar = 2,
  non_seasonal_differences = 1,
  non_seasonal_ma = 2
) %>%
  set_engine("auto_arima") %>%
  fit(value ~ date, training(splits))
modelFitArima
parsnip model object

Series: outcome 
ARIMA(2,1,2)(0,1,2)[12] 

Coefficients:
          ar1     ar2      ma1      ma2     sma1     sma2
      -0.0794  0.7370  -0.0192  -0.9356  -0.5090  -0.1233
s.e.   0.0442  0.0444   0.0240   0.0251   0.0453   0.0401

sigma^2 = 1498:  log likelihood = -2492.86
AIC=4999.71   AICc=4999.94   BIC=5029.09

Prophet

modelFitProphet <- prophet_reg() %>%
  set_engine("prophet", yearly.seasonality = TRUE) %>%
  fit(value ~ date, training(splits))
modelFitProphet
parsnip model object

PROPHET Model
- growth: 'linear'
- n.changepoints: 25
- changepoint.range: 0.8
- yearly.seasonality: 'auto'
- weekly.seasonality: 'auto'
- daily.seasonality: 'auto'
- seasonality.mode: 'additive'
- changepoint.prior.scale: 0.05
- seasonality.prior.scale: 10
- holidays.prior.scale: 10
- logistic_cap: NULL
- logistic_floor: NULL
- extra_regressors: 0

Machine Learning Models - create a pre-processing recipe - create model specs - use workflow to combine model specs, pre-processing and fit model

Pre-processing Recipe:

recipeSpecs <- recipe(value ~ date, training(splits)) %>%
  step_timeseries_signature(date) %>%
  step_dummy(all_nominal())

recipeSpecs %>%
  prep() %>%
  juice()
# A tibble: 504 × 44
   date       value date_index…¹ date_…² date_…³ date_…⁴ date_…⁵ date_…⁶ date_…⁷
   <date>     <dbl>        <dbl>   <int>   <int>   <int>   <int>   <int>   <int>
 1 1978-01-01  99.0    252460800    1978    1977       1       1       1       0
 2 1978-02-01  87.0    255139200    1978    1978       1       1       2       1
 3 1978-03-01  95.5    257558400    1978    1978       1       1       3       2
 4 1978-04-01  84.2    260236800    1978    1978       1       2       4       3
 5 1978-05-01  89.7    262828800    1978    1978       1       2       5       4
 6 1978-06-01  78.5    265507200    1978    1978       1       2       6       5
 7 1978-07-01  96.4    268099200    1978    1978       2       3       7       6
 8 1978-08-01 113.     270777600    1978    1978       2       3       8       7
 9 1978-09-01  87.8    273456000    1978    1978       2       3       9       8
10 1978-10-01  99.6    276048000    1978    1978       2       4      10       9
# … with 494 more rows, 35 more variables: date_day <int>, date_hour <int>,
#   date_minute <int>, date_second <int>, date_hour12 <int>, date_am.pm <int>,
#   date_wday <int>, date_wday.xts <int>, date_mday <int>, date_qday <int>,
#   date_yday <int>, date_mweek <int>, date_week <int>, date_week.iso <int>,
#   date_week2 <int>, date_week3 <int>, date_week4 <int>, date_mday7 <int>,
#   date_month.lbl_01 <dbl>, date_month.lbl_02 <dbl>, date_month.lbl_03 <dbl>,
#   date_month.lbl_04 <dbl>, date_month.lbl_05 <dbl>, …

once the recipe is ready, we can set up the machine learning pipelines.

Elastic Net

  • we will first build the model
  • next, we will make that model into a fitted workflow
modelSpec_glmnet <- linear_reg(penalty = 0.01, mixture = 0.05) %>%
  set_engine("glmnet")

workflowFit_glmnet <- workflow() %>%
  add_model(modelSpec_glmnet) %>%
  add_recipe(recipeSpecs %>%
               step_rm(date)) %>%
  fit(training(splits))

Random Forest - similar process as the Elastic Net

modelSpec_rf <- rand_forest(trees = 500, min_n = 50, mode = "regression") %>%
  set_engine("randomForest")

workflowFit_rf <- workflow() %>%
  add_model(modelSpec_rf) %>%
  add_recipe(recipeSpecs %>%
               step_rm(date)) %>%
  fit(training(splits))

Adding new models - XGBoost - SVM

XGBoost:

recipeSpecsParsnip <- recipeSpecs %>%
  update_role(date, new_role = "ID")

workflowFit_xgboost <- workflow() %>%
  add_model(
    boost_tree(
      trees = 500,
      min_n = 1,
      tree_depth = 15,
      mode = "regression"
    ) %>%
      set_engine("xgboost")
  ) %>%
  add_recipe(recipeSpecsParsnip) %>%
  fit(training(splits))

SVM:

workflowFit_svm <- workflow() %>%
  add_model(
    svm_rbf(
      cost = 1,
      margin = 0.1
    ) %>%
      set_engine("kernlab") %>%
      set_mode("regression")
  ) %>%
  add_recipe(recipeSpecsParsnip) %>%
  fit(training(splits))

New Hybrid Models

  • these combine automated algorithms with machine learning

Prophet Boost

  • algorithm works by modelling the univariate series using Prophet
  • uses regresses supplied via the preprocessing rrrcipe
  • regresses the prophet residuals with the XGBoost model
modelSpec_prophetBoost <- prophet_boost() %>%
  set_engine("prophet_xgboost", yearly.seasonality = TRUE)

workflowFit_prophetBoost <- workflow() %>%
  add_model(modelSpec_prophetBoost) %>%
  add_recipe(recipeSpecs) %>%
  fit(training(splits))

ARIMA Boosted:

workflowFit_arimaBoost <- workflow() %>%
  add_model(
    arima_boost(
      non_seasonal_ar = 2,
      non_seasonal_differences = 1,
      non_seasonal_ma = 2,
      trees = 500,
      min_n = 1,
      tree_depth = 15
    ) %>%
      set_engine("auto_arima_xgboost")
  ) %>%
  add_recipe(recipeSpecs) %>%
  fit(training(splits))

Modeltime Table

  • organizes the models with IDs and creates generic descriptions too help us keep track of our models
modelTable <- modeltime_table(
  modelFitArima,
  modelFitProphet,
  workflowFit_glmnet,
  workflowFit_rf,
  workflowFit_svm,
  workflowFit_xgboost,
  workflowFit_arimaBoost,
  workflowFit_prophetBoost
) %>%
  update_model_description(1, "ARIMA") %>%
  update_model_description(2, "Prophet") %>%
  update_model_description(3, "glmNet") %>%
  update_model_description(4, "Random Forest") %>%
  update_model_description(5, "SVM") %>%
  update_model_description(6, "XGBoost") %>%
  update_model_description(7, "ARIMA Boost") %>%
  update_model_description(8, "Prophet Boost")

modelTable
# Modeltime Table
# A tibble: 8 × 3
  .model_id .model     .model_desc  
      <int> <list>     <chr>        
1         1 <fit[+]>   ARIMA        
2         2 <fit[+]>   Prophet      
3         3 <workflow> glmNet       
4         4 <workflow> Random Forest
5         5 <workflow> SVM          
6         6 <workflow> XGBoost      
7         7 <workflow> ARIMA Boost  
8         8 <workflow> Prophet Boost

Calibration

  • model calibration is used to quantify error and estimate confidence intervals
  • two columns will be generated: .type and .calibration_data
  • .calibration_data includes the actual values, fitted values and the residuals for the testing set
calibrationTable <- modelTable %>%
  modeltime_calibrate(testing(splits))

calibrationTable
# Modeltime Table
# A tibble: 8 × 5
  .model_id .model     .model_desc   .type .calibration_data
      <int> <list>     <chr>         <chr> <list>           
1         1 <fit[+]>   ARIMA         Test  <tibble [36 × 4]>
2         2 <fit[+]>   Prophet       Test  <tibble [36 × 4]>
3         3 <workflow> glmNet        Test  <tibble [36 × 4]>
4         4 <workflow> Random Forest Test  <tibble [36 × 4]>
5         5 <workflow> SVM           Test  <tibble [36 × 4]>
6         6 <workflow> XGBoost       Test  <tibble [36 × 4]>
7         7 <workflow> ARIMA Boost   Test  <tibble [36 × 4]>
8         8 <workflow> Prophet Boost Test  <tibble [36 × 4]>

Forecast (Testing Set)

  • with the calibrated data, we can visualize testing predictions (forecast)
forecastTable <- calibrationTable %>%
  modeltime_forecast(actual_data = vaByMarketTable)

forecastTable
# A tibble: 828 × 7
   .model_id .model_desc .key   .index     .value .conf_lo .conf_hi
       <int> <chr>       <fct>  <date>      <dbl>    <dbl>    <dbl>
 1        NA ACTUAL      actual 1978-01-01   99.0       NA       NA
 2        NA ACTUAL      actual 1978-02-01   87.0       NA       NA
 3        NA ACTUAL      actual 1978-03-01   95.5       NA       NA
 4        NA ACTUAL      actual 1978-04-01   84.2       NA       NA
 5        NA ACTUAL      actual 1978-05-01   89.7       NA       NA
 6        NA ACTUAL      actual 1978-06-01   78.5       NA       NA
 7        NA ACTUAL      actual 1978-07-01   96.4       NA       NA
 8        NA ACTUAL      actual 1978-08-01  113.        NA       NA
 9        NA ACTUAL      actual 1978-09-01   87.8       NA       NA
10        NA ACTUAL      actual 1978-10-01   99.6       NA       NA
# … with 818 more rows

Plotting the forecasts:

forecastPlots <- forecastTable %>%
  filter(.model_desc != "ACTUAL") %>%
  plot_modeltime_forecast(
    .interactive = FALSE,
    .facet_vars = .model_desc,
    .facet_ncol = 4,
    .facet_scales = "fixed",
    .legend_show = FALSE
  ) +
  geom_line(
    data = forecastTable %>%
      filter(.model_desc == "ACTUAL") %>%
      select(.index, .value),
    size = 0.3
  ) +
  labs(
    title = "", y = "No. of Visitors", x = "Year"
  ) +
  theme_bw() +
  theme(legend.position = "none")

ggplotly(forecastPlots)

Another way to plot accuracy:

accuracyTable <- calibrationTable %>%
  modeltime_accuracy() %>%
  select(.model_desc, mae, mape, mase, smape, rmse, rsq) %>%
  reactable(
    columns = list(
      .model_desc = colDef(name = "Model"),
      mae = colDef(name = "Mean Absolute Error", format = colFormat(digits = 2)),
      mape = colDef(name = "Mean Absolute Percentage Error", format = colFormat(digits = 2)),
      mase = colDef(name = "Mean Absolute Scaled Error", format = colFormat(digits = 2)),
      smape = colDef(name = "Symmetric Mean Absolute Percentage Error", format = colFormat(digits = 2)),
      rmse = colDef(name = "Root Mean Squared Error", format = colFormat(digits = 2)),
      rsq = colDef(name = "R-Squared", format = colFormat(digits = 2))
    ),
    highlight = TRUE,
    striped = TRUE,
    searchable = TRUE,
    showPageSizeOptions = FALSE
  )

accuracyTable

Now that we have done forecasting, let’s do some refitting:

refitTable <- calibrationTable %>%
  modeltime_refit(vaByMarketTable) %>%
  modeltime_forecast(h = "3 years",
                     actual_data = vaByMarketTable) %>%
  mutate(.model_desc = str_replace_all(.model_desc, "UPDATE: ARIMA\\(1,1,0\\)\\(2,0,0\\)\\[12\\]", "ARIMA")) %>%
  mutate(.model_desc = str_replace_all(.model_desc, "W/ XGBOOST ERRORS", "Boost"))

refitTable
# A tibble: 828 × 7
   .model_id .model_desc .key   .index     .value .conf_lo .conf_hi
       <int> <chr>       <fct>  <date>      <dbl>    <dbl>    <dbl>
 1        NA ACTUAL      actual 1978-01-01   99.0       NA       NA
 2        NA ACTUAL      actual 1978-02-01   87.0       NA       NA
 3        NA ACTUAL      actual 1978-03-01   95.5       NA       NA
 4        NA ACTUAL      actual 1978-04-01   84.2       NA       NA
 5        NA ACTUAL      actual 1978-05-01   89.7       NA       NA
 6        NA ACTUAL      actual 1978-06-01   78.5       NA       NA
 7        NA ACTUAL      actual 1978-07-01   96.4       NA       NA
 8        NA ACTUAL      actual 1978-08-01  113.        NA       NA
 9        NA ACTUAL      actual 1978-09-01   87.8       NA       NA
10        NA ACTUAL      actual 1978-10-01   99.6       NA       NA
# … with 818 more rows

Now we will use the refitted data and do forward forecasting:

refitPlots <- refitTable %>%
  filter(.model_desc != "ACTUAL") %>%
  plot_modeltime_forecast(
    .interactive = FALSE,
    .facet_vars = .model_desc,
    .facet_ncol = 4,
    .facet_scales = "fixed",
    .legend_show = FALSE
  ) +
  geom_line(
    data = forecastTable %>%
      filter(.model_desc == "ACTUAL") %>%
      select(.index, .value),
    size = 0.3
  ) +
  labs(
    title = ""
  ) +
  theme_bw() +
  theme(legend.position = "none")

ggplotly(refitPlots)

Now we have our forecasts and refit + forecasts. Let’s plot the residuals now:

residualsTable <- calibrationTable %>%
  modeltime_residuals()

residualsPlots <- residualsTable %>%
  plot_modeltime_residuals(
    .interactive = FALSE,
    .type = "timeplot",
    .facet_vars = .model_desc,
    .facet_ncol = 4,
    .facet_scales = "fixed"
  ) +
  labs(
    title = "",
    y = "Residuals"
  ) +
  theme_bw() +
  theme(legend.position = "none")

ggplotly(residualsPlots)

xxx

4. Visitors’ Demographics

4.1. Cleansing the dataset

This file

  • contains the international visitor arrivals by gender and age groups (monthly for the last 6 months of dec)
  • excludes arrivals of Malaysians by land
  • all numbers are counts

The following was done to cleanup the vaByDemoRaw dataframe:

  • remove the bottom few rows as they were unnecessary for our visualizations
  • rename fields and rearrange the columns
  • split the dataframe into two, one to contain info on gender and one to contain info on age
  • next, we will pivot date (month-year) and the number of visitors to reduce the number of columns
vaByDemo <- slice(vaByDemoRaw, 2:11)
vaByDemoGender <- slice(vaByDemo, 1:2)
vaByDemoAge <- slice(vaByDemo, 3:11)

Cleansing Gender dataframe:

colnames(vaByDemoGender)[1] <- "Gender"
vaByDemoGender <- vaByDemoGender %>%
  pivot_longer(cols = !c("Gender"), names_to = "Period", values_to = "Visitors") %>%
  mutate(Period = as.Date(paste(Period, "01"), "%Y %B %d")) %>%
  mutate(Year = year(Period))
vaByDemoGender
# A tibble: 12 × 4
   Gender  Period     Visitors  Year
   <chr>   <date>        <dbl> <dbl>
 1 Males   2022-12-01   457823  2022
 2 Males   2022-11-01   428447  2022
 3 Males   2022-10-01   423112  2022
 4 Males   2022-09-01   419664  2022
 5 Males   2022-08-01   375963  2022
 6 Males   2022-07-01   371596  2022
 7 Females 2022-12-01   473507  2022
 8 Females 2022-11-01   387885  2022
 9 Females 2022-10-01   393712  2022
10 Females 2022-09-01   362542  2022
11 Females 2022-08-01   352779  2022
12 Females 2022-07-01   355136  2022

Cleansing Age dataframe:

colnames(vaByDemoAge)[1] <- "AgeGroup"
vaByDemoAge <- vaByDemoAge %>%
  pivot_longer(cols = !c("AgeGroup"), names_to = "Period", values_to = "Visitors") %>%
  mutate(Period = as.Date(paste(Period, "01"), "%Y %B %d")) %>%
  mutate(Year = year(Period))
vaByDemoAge
# A tibble: 48 × 4
   AgeGroup       Period     Visitors  Year
   <chr>          <date>        <dbl> <dbl>
 1 Under 15 Years 2022-12-01   123734  2022
 2 Under 15 Years 2022-11-01    48421  2022
 3 Under 15 Years 2022-10-01    63829  2022
 4 Under 15 Years 2022-09-01    50827  2022
 5 Under 15 Years 2022-08-01    64064  2022
 6 Under 15 Years 2022-07-01    78914  2022
 7 15-19 Years    2022-12-01    43167  2022
 8 15-19 Years    2022-11-01    16801  2022
 9 15-19 Years    2022-10-01    21290  2022
10 15-19 Years    2022-09-01    19359  2022
# … with 38 more rows

4.2. Preparations for visualizations

To find the age range of most visitors:

visitorsAge <- vaByDemoAge %>%
  group_by(AgeGroup) %>%
  summarise(Visitors = sum(Visitors))

visitorsAge <- visitorsAge[order(visitorsAge$Visitors, decreasing = TRUE), ]
mostAge <- head(visitorsAge$AgeGroup, 1)

mostAge
[1] "25-34 Years"

4.3. Visualizations

pacman::p_load(ggplot2, plotly, timetk, ggHoriPlot, ggridges, gganimate)
vaByDemoGender <- vaByDemoGender %>%
  mutate(Month = month(Period))
vaByDemoGender
# A tibble: 12 × 5
   Gender  Period     Visitors  Year Month
   <chr>   <date>        <dbl> <dbl> <dbl>
 1 Males   2022-12-01   457823  2022    12
 2 Males   2022-11-01   428447  2022    11
 3 Males   2022-10-01   423112  2022    10
 4 Males   2022-09-01   419664  2022     9
 5 Males   2022-08-01   375963  2022     8
 6 Males   2022-07-01   371596  2022     7
 7 Females 2022-12-01   473507  2022    12
 8 Females 2022-11-01   387885  2022    11
 9 Females 2022-10-01   393712  2022    10
10 Females 2022-09-01   362542  2022     9
11 Females 2022-08-01   352779  2022     8
12 Females 2022-07-01   355136  2022     7
vaByDemoAge <- vaByDemoAge %>%
  mutate(Month = month(Period))
vaByDemoAge
# A tibble: 48 × 5
   AgeGroup       Period     Visitors  Year Month
   <chr>          <date>        <dbl> <dbl> <dbl>
 1 Under 15 Years 2022-12-01   123734  2022    12
 2 Under 15 Years 2022-11-01    48421  2022    11
 3 Under 15 Years 2022-10-01    63829  2022    10
 4 Under 15 Years 2022-09-01    50827  2022     9
 5 Under 15 Years 2022-08-01    64064  2022     8
 6 Under 15 Years 2022-07-01    78914  2022     7
 7 15-19 Years    2022-12-01    43167  2022    12
 8 15-19 Years    2022-11-01    16801  2022    11
 9 15-19 Years    2022-10-01    21290  2022    10
10 15-19 Years    2022-09-01    19359  2022     9
# … with 38 more rows

Time Series Plot for Gender

vaByDemoGender %>%
  plot_time_series(Period, Visitors, .color_var = Gender, .smooth = FALSE, .interactive = TRUE)

Another way to plot time series:

p <- vaByDemoGender %>%
  ggplot(aes(x = Period, y = Visitors, color = Gender)) +
  geom_line() +
  labs(x = "Period", y = "No. Of Visitors", title = NULL) +
  theme_bw() +
  theme(axis.text = element_text(size = 7), axis.title = element_text(size = 7), legend.position = "none")

p %>%
  ggplotly()

Time Series Plot for Age Groups

vaByDemoAge %>%
  plot_time_series(Period, Visitors, .color_var = AgeGroup, .smooth = FALSE, .interactive = TRUE)

Horizon Plot by Gender

vaByDemoGender %>%
  group_by(Period, Gender) %>%
  summarise(Visitors = sum(Visitors)) %>%
  ggplot() +
  geom_horizon(aes(x = Period, y =  Visitors), origin = "midpoint", horizonscale = 6) +
  facet_grid(Gender~.) +
  scale_fill_hcl(palette = "RdBu") +
  scale_x_date(expand = c(0,0), date_breaks = "1 month", date_labels = " %b %y") +
  theme_minimal() +
  theme(panel.spacing.y = unit(0, "lines"), 
        strip.text.y = element_text(size = 10, angle = 0, hjust = 0),
        legend.position = 'none',
        axis.text.y = element_blank(),
        axis.text.x = element_text(size = 7),
        axis.title.y = element_blank(),
        axis.title.x = element_blank(),
        axis.ticks.y = element_blank(),
        panel.border = element_blank())

Horizon Plot by Age Group

vaByDemoAge %>%
  group_by(Period, AgeGroup) %>%
  summarise(Visitors = sum(Visitors)) %>%
  ggplot() +
  geom_horizon(aes(x = Period, y =  Visitors), origin = "midpoint", horizonscale = 6) +
  facet_grid(AgeGroup~.) +
  scale_fill_hcl(palette = "RdBu") +
  scale_x_date(expand = c(0,0), date_breaks = "1 month", date_labels = " %b %y") +
  theme_minimal() +
  theme(panel.spacing.y = unit(0, "lines"), 
        strip.text.y = element_text(size = 10, angle = 0, hjust = 0),
        legend.position = 'none',
        axis.text.y = element_blank(),
        axis.text.x = element_text(size = 7),
        axis.title.y = element_blank(),
        axis.title.x = element_blank(),
        axis.ticks.y = element_blank(),
        panel.border = element_blank())

Heatmap by Gender

figHeatmapPlot <- ggplot(data = vaByDemoGender) +
  geom_tile(aes(x = Month, y = Gender, fill = Visitors)) +
  theme_bw() +
  theme(legend.position = "none") +
  scale_x_continuous(breaks = seq_along(month.name), labels = month.abb)

ggplotly(figHeatmapPlot)

Heatmap by Age Group

figHeatmapPlot <- ggplot(data = vaByDemoAge) +
  geom_tile(aes(x = Month, y = AgeGroup, fill = Visitors)) +
  theme_bw() +
  theme(legend.position = "none") +
  scale_x_continuous(breaks = seq_along(month.name), labels = month.abb)

ggplotly(figHeatmapPlot)

xxx

5. Visitors’ Length of Stay

5.1 Cleansing the dataset

This file

  • contains the international visitor arrivals by length of stay (monthly for the last 6 months of dec)
  • excludes arrivals of Malaysians by land
  • all numbers are counts

The following was done to cleanup the vaByStayRaw dataframe:

  • remove the bottom few rows as they were unnecessary for our visualizations
  • rename fields and rearrange the columns
  • next, we will pivot date (month-year) and the number of visitors to reduce the number of columns
  • we will also replace all mention of “(Number)” as we do not need this text explicitly stated in our column
vaByStay <- slice(vaByStayRaw, 2:15)
vaByStay
# A tibble: 14 × 7
   `Data Series`           `2022 Dec` `2022 Nov` 2022 …¹ 2022 …² 2022 …³ 2022 …⁴
   <chr>                        <dbl>      <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
 1 Under 1 Day (Number)        166698     146629  136573  134931  125981  108581
 2 1 Day (Number)              114412     108266  102074   97676   96875   88056
 3 2 Days (Number)             118019     124149  117744  104369  101746   91996
 4 3 Days (Number)             129601     132535  140122  114944  112391  109801
 5 4 Days (Number)              92109      86753   98830   76710   74885   75171
 6 5 Days (Number)              54792      51619   58714   43580   41261   44932
 7 6 Days (Number)              34055      30955   37834   27037   26212   29829
 8 7 Days (Number)              23613      21826   28066   19442   19651   23829
 9 8-10 Days (Number)           30053      25531   33328   24410   26013   30521
10 11-14 Days (Number)          18078      15127   18215   14734   17264   19508
11 15 Days & Over (Number)      34224      32414   34479   33607   43560   44098
12 15-29 Days (Number)          22915      21036   23278   21194   27657   29614
13 30-59 Days (Number)           7458       7646    7415    8175   11112    9747
14 60 Days & Over (Number)       3851       3732    3786    4238    4791    4737
# … with abbreviated variable names ¹​`2022 Oct`, ²​`2022 Sep`, ³​`2022 Aug`,
#   ⁴​`2022 Jul`

Cleansing the dataframe:

colnames(vaByStay)[1] <- "Duration"
vaByStay <- vaByStay %>%
  pivot_longer(cols = !c("Duration"), names_to = "Period", values_to = "Visitors") %>%
  mutate(Period = as.Date(paste(Period, "01"), "%Y %B %d")) %>%
  mutate(Year = year(Period)) %>%
  mutate(Month = month(Period)) %>%
  mutate(Duration = str_replace_all(Duration, " \\(Number\\)", ""))
vaByStay
# A tibble: 84 × 5
   Duration    Period     Visitors  Year Month
   <chr>       <date>        <dbl> <dbl> <dbl>
 1 Under 1 Day 2022-12-01   166698  2022    12
 2 Under 1 Day 2022-11-01   146629  2022    11
 3 Under 1 Day 2022-10-01   136573  2022    10
 4 Under 1 Day 2022-09-01   134931  2022     9
 5 Under 1 Day 2022-08-01   125981  2022     8
 6 Under 1 Day 2022-07-01   108581  2022     7
 7 1 Day       2022-12-01   114412  2022    12
 8 1 Day       2022-11-01   108266  2022    11
 9 1 Day       2022-10-01   102074  2022    10
10 1 Day       2022-09-01    97676  2022     9
# … with 74 more rows

xxx

5.2 Preparations for visualizations

To find the avg days most visitors stayed:

visitorsStayed <- vaByStay %>%
  group_by(Duration) %>%
  summarise(Visitors = sum(Visitors))

visitorsStayed <- visitorsStayed[order(visitorsStayed$Visitors, decreasing = TRUE), ]
mostStayed <- head(visitorsStayed$Duration, 1)

mostStayed
[1] "Under 1 Day"

xxx

5.3 Visualizations

Time Series Plot for Gender

vaByStay %>%
  plot_time_series(Period, Visitors, .smooth = FALSE, .color_var = Duration, .interactive = TRUE)

Time Series Boxplot for Length of Stay

vaByStay %>%
  plot_time_series_boxplot(Period, Visitors, .smooth = FALSE, .period = "1 month", .interactive = TRUE, .title = NULL)

Time Series Analysis of Length of Stay

p <- vaByStay %>%
  ggplot(aes(x = Period, y = Visitors, color = Duration)) +
  geom_line() +
  labs(x = "Period", y = "No. Of Visitors", title = NULL) +
  theme_bw() +
  theme(axis.text = element_text(size = 7), axis.title = element_text(size = 7))

p %>%
  ggplotly()

Ridgeline Plot by Length of Stay

vaByStay %>%
  ggplot(aes(x = Visitors, y = Duration, fill = after_stat(x))) +
  geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01) +
  scale_fill_viridis_c(name = "Visitors", option = "C") +
  theme_bw()

pacman::p_load(viridis, hrbrthemes)

Area Chart by Length of Stay

areaChart <- vaByStay %>% 
  ggplot(aes(x = Period, y = Visitors, fill = Duration)) +
    geom_area() +
    scale_fill_viridis(discrete = TRUE) +
    theme(legend.position="none") +
    theme_bw() +
    theme(legend.position="none")

ggplotly(areaChart)

Horizon Plot by Length of Stay

vaByStay %>%
  group_by(Period, Duration) %>%
  summarise(Visitors = sum(Visitors)) %>%
  ggplot() +
  geom_horizon(aes(x = Period, y =  Visitors), origin = "midpoint", horizonscale = 6) +
  facet_grid(Duration~.) +
  scale_fill_hcl(palette = "RdBu") +
  scale_x_date(expand = c(0,0), date_breaks = "1 month", date_labels = " %b %y") +
  theme_minimal() +
  theme(panel.spacing.y = unit(0, "lines"), 
        strip.text.y = element_text(size = 10, angle = 0, hjust = 0),
        legend.position = 'none',
        axis.text.y = element_blank(),
        axis.text.x = element_text(size = 7),
        axis.title.y = element_blank(),
        axis.title.x = element_blank(),
        axis.ticks.y = element_blank(),
        panel.border = element_blank())

Heatmap by Length of Stay

figHeatmapLOS <- ggplot(vaByStay) +
  geom_tile(aes(x = Month, y = Duration, fill = Visitors)) +
  theme_bw() +
  theme(legend.position = "none") +
  scale_x_continuous(breaks = seq_along(month.name), labels = month.abb)

ggplotly(figHeatmapLOS)

Using ggTimeSeries

pacman::p_load(ggTimeSeries)

Streamgraph for Length of Stay (doesn’t look that nice)

p1 <- vaByStay %>%
  ggplot(aes(x = Period, y = Visitors, group = Duration, fill = Duration)) +
  stat_steamgraph()
ggplotly(p1)

using ggbump

pacman::p_load(ggbump)

Bump Chart for Length of Stay

px <- vaByStay %>% 
  ggplot(aes(x = Period, y = Visitors, color = Duration)) +
  geom_bump(size = 1) +
  geom_point(size = 1.2)

ggplotly(px)

xxx